Summary

This markdown file is performing clincluster on secretory cells, generating the clusters and finally plot the heatmap.

last date checked: 14 Nov 2019

Read data

First we read in the processed dataset.

sceset <- readRDS("../../scFT-paper_rds/20190214_allFT_Clincluster_12clusters_sceset_withUMAP.rds")

# load("../RData/20180725_allFT_Clincluster_12clusters.RData")
# saveRDS(sceset, "../rds/20190214_allFT_Clincluster_12clusters_sceset_withUMAP.rds", compress = T)
# write.csv(as.data.frame(sceset@colData), "../GEO/sceset_colData.csv")
# write.table(counts(sceset), "../GEO/sceset_counts.txt", quote = F, sep = "\t")
dim(sceset)
## [1] 22110  3877
plotTSNE(sceset, colour_by = "final.clusters")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

# UMAP
# too very long time to run
# set.seed(1234)
# sceset <- runUMAP(sceset, ncomponents = 2, feature_set = rowData(sceset)$high.var == T,
#   exprs_values = "logcounts", scale_features = TRUE)
# plotUMAP(sceset, colour_by = "Patient2") + xlab("UMAP_1") + ylab("UMAP_2")

plotUMAP(sceset, colour_by = "final.clusters") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

Plot the UMAP of fresh cells only

p1 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "Patient") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "EPCAM") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "KRT7") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2, p3, ncol = 3)

# ggsave("plots/Fig1_UMAP_EPCAMandKRT7_fresh_20190510.png", width = 11, height = 2.6)

Intermediate cell type

p1 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "KRT7") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "PAX8") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "CCDC17") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p4 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "CAPS") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2,p3,p4)

# ggsave("plots/SuppFig2_intermediate_PCAs.png")

Filtering to keep good-quality secretory cells

## select the fresh secretory cells
secretory <- sceset[,sceset$final.clusters %in% c(8,9,10) & sceset$source == "Fresh" & sceset$Patient != "15066L"] # dim(secretory)
# [1] 22110  1747
secretory <- secretory[   ,logcounts(secretory)["KRT7",] > 2 & 
                           logcounts(secretory)["EPCAM",] > 2 & 
                           logcounts(secretory)["PTPRC",] == 0 &
                           logcounts(secretory)["CCDC17",] < 1 ]
dim(secretory)
## [1] 22110  1410
# [1] 22110  1410

Markers of ciliated/secretory

# not run
ciliated  <- sceset[,sceset$final.clusters %in% c(11) & sceset$source == "Fresh" & sceset$Patient != "15066L"]
ciliated <- ciliated[   ,logcounts(ciliated)["KRT7",] <= 2 &
                         logcounts(ciliated)["EPCAM",] > 2 &
                         logcounts(ciliated)["PTPRC",] == 0 &
                         logcounts(ciliated)["CCDC17",] >= 1 ]
dim(ciliated)
matrix <- expm1(cbind(logcounts(secretory), logcounts(ciliated)))
keep <- rowSums(matrix > 1) > 5
sum(keep)

dge <- edgeR::DGEList(counts = matrix[keep,]) # make a edgeR object
rm(matrix,keep)
group <- c(rep("SC",1410), rep("CC", 91))
patient <- c(secretory$Patient2, ciliated$Patient2)
design <- model.matrix(~ 0+group + patient)
v <- voom(dge, design, plot = TRUE)
fit <- lmFit(v, design)

cont.matrix <- makeContrasts(contrasts = "groupSC-groupCC",levels=colnames(design))
fit <- contrasts.fit(fit, cont.matrix)
fit <- eBayes(fit)

rls <- topTable(fit, n = Inf, coef = 1, sort = "logFC", lfc = 1, p = 0.05 )
rls$gene <- rownames(rls)

# write.csv(rls, "../tables/TableS4_markers_secretory_ciliated20190214.csv", row.names = F)

Preprocessing the data of fresh secretory cells

Scale data

source("clincluster/clincluster_functions.R")
secretory <- PrepareData(secretory, col.for.cluster = "Patient2", do.scale = T)
## [1] "Normalising the data..."
## [1] "Centre the data"

High variable genes

secretory <- HighVarGenes(secretory)
table(rowData(secretory)$high.var)
## 
## FALSE  TRUE 
## 19721  2389
# FALSE  TRUE 
# 19721  2389 
ggplot(data = data.frame(gene.mean = rowData(secretory)$gene.mean,
                         gene.dispersion = rowData(secretory)$gene.dispersion,
                         high.var = rowData(secretory)$high.var), 
       aes(x = gene.mean, y = gene.dispersion, col = high.var) ) +
      geom_point(alpha=0.4) 

Run tSNE by the log-transformed data

set.seed(1234)
secretory <- runTSNE(object = secretory, ncomponents = 2, 
                     feature_set = rownames(secretory)[rowData(secretory)$high.var],
                     exprs_values = "logcounts", 
                     perplexity = min(50, floor(ncol(secretory)/5)))

the tSNE from log-transformed data is better then the centred data.

Calculate first 20 PCs

Calculate 20 PCs from high variance genes and the log-transformed counts.

set.seed(12345)
secretory <- runPCA(object = secretory, ncomponents = 20, 
                    exprs_values = "logcounts", rand_seed = 12345,
                    feature_set = rownames(secretory)[rowData(secretory)$high.var == TRUE])
plotPCA(secretory)
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

Plot the variance by PCs.

plot(1:50, (attr(secretory@reducedDims$PCA, "percentVar")[1:50])*100, pch = 20, xlab = "PC", ylab = "Standard Deviation of PC")

Initial clustering

specClust alllows to estimate several popular spectral clustering algorithms, for an overview see von Luxburg (2007).

The Laplacian is constructed from a from nearest neighbors and there are several kernels available. The eigenvalues and eigenvectors are computed using the binding in igraph to arpack. This should ensure that this algorithm is also feasable for larger datasets as the the the distances used have dimension n*m, where n is the number of observations and m the number of nearest neighbors. The Laplacian is sparse and has roughly n*m elements and only k eigenvectors are computed, where k is the number of centers.

set.seed(123456)
secretory <- InitialCluster(secretory, k = c(4,6,6,6,6), ncomponents = 1:12, n.neighbor = 7, spec.method = "kknn")
## [1] "Got data ready"
## [1] "Initial clustering"
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=================================================================| 100%
plotTSNE(secretory, colour_by = "Patient2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

table(secretory$initial.cluster)
## 
## 11543.1 11543.2 11543.3 11543.4 11545.1 11545.2 11545.3 11545.4 11545.5 
##      16      88      56       9      81      86      71      82      18 
## 11545.6 11553.1 11553.2 11553.3 11553.4 11553.5 11553.6 15066.1 15066.2 
##      63       9      90     114     109      23      14      26       9 
## 15066.3 15066.4 15066.5 15066.6 15072.1 15072.2 15072.3 15072.4 15072.5 
##      28      54      48      85      46      13      32      29      47 
## 15072.6 
##      64

Which clustering is better?

Visualisation of initial clusters

p1 <- plotTSNE(secretory[,secretory$Patient2 == 11543], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotTSNE(secretory[,secretory$Patient2 == 11545], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotTSNE(secretory[,secretory$Patient2 == 11553], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p4 <- plotTSNE(secretory[,secretory$Patient2 == 15066], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p5 <- plotTSNE(secretory[,secretory$Patient2 == 15072], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2,p3,p4,p5,nrow = 2)

Limma

Remove low expressed genes

matrix <- expm1(logcounts(secretory))
keep <- rowSums(matrix > 1) > 5
sum(keep)
## [1] 15508
# 15508
dge <- edgeR::DGEList(counts = matrix[keep,]) # make a edgeR object
rm(matrix,keep)
secretory@colData$Patient2 <- as.factor(secretory@colData$Patient2)
design <- model.matrix(~  0 + initial.cluster, data = secretory@colData)  # Use 0 because we do not need intercept for this linear model
design2 <- model.matrix(~  0 + Patient2, data = secretory@colData)  
colnames(design)
##  [1] "initial.cluster11543.1" "initial.cluster11543.2"
##  [3] "initial.cluster11543.3" "initial.cluster11543.4"
##  [5] "initial.cluster11545.1" "initial.cluster11545.2"
##  [7] "initial.cluster11545.3" "initial.cluster11545.4"
##  [9] "initial.cluster11545.5" "initial.cluster11545.6"
## [11] "initial.cluster11553.1" "initial.cluster11553.2"
## [13] "initial.cluster11553.3" "initial.cluster11553.4"
## [15] "initial.cluster11553.5" "initial.cluster11553.6"
## [17] "initial.cluster15066.1" "initial.cluster15066.2"
## [19] "initial.cluster15066.3" "initial.cluster15066.4"
## [21] "initial.cluster15066.5" "initial.cluster15066.6"
## [23] "initial.cluster15072.1" "initial.cluster15072.2"
## [25] "initial.cluster15072.3" "initial.cluster15072.4"
## [27] "initial.cluster15072.5" "initial.cluster15072.6"

Incoporate patients into contrast matrix

v <- voom(dge, design, plot = F)

fit <- lmFit(v, design) # Linear Model for Series of Arrays

initial.clusters <- data.frame(id = colnames(design),
                               short_id = gsub(pattern = "initial.cluster", 
                                               replacement = "", x = colnames(design)),
                               patient = substr(colnames(design), start = 16, stop = 20))
head(initial.clusters)
##                       id short_id patient
## 1 initial.cluster11543.1  11543.1   11543
## 2 initial.cluster11543.2  11543.2   11543
## 3 initial.cluster11543.3  11543.3   11543
## 4 initial.cluster11543.4  11543.4   11543
## 5 initial.cluster11545.1  11545.1   11545
## 6 initial.cluster11545.2  11545.2   11545
## Automating makeContrasts call in limma
nc <- nrow(initial.clusters)
contrast_all <- gtools::permutations(v = as.character(initial.clusters$id), n = nc, r = 2)
contrast_all <- as.data.frame(contrast_all)
head(contrast_all)
##                       V1                     V2
## 1 initial.cluster11543.1 initial.cluster11543.2
## 2 initial.cluster11543.1 initial.cluster11543.3
## 3 initial.cluster11543.1 initial.cluster11543.4
## 4 initial.cluster11543.1 initial.cluster11545.1
## 5 initial.cluster11543.1 initial.cluster11545.2
## 6 initial.cluster11543.1 initial.cluster11545.3

Number of cells in each initial cluster

table(secretory$initial.cluster)
## 
## 11543.1 11543.2 11543.3 11543.4 11545.1 11545.2 11545.3 11545.4 11545.5 
##      16      88      56       9      81      86      71      82      18 
## 11545.6 11553.1 11553.2 11553.3 11553.4 11553.5 11553.6 15066.1 15066.2 
##      63       9      90     114     109      23      14      26       9 
## 15066.3 15066.4 15066.5 15066.6 15072.1 15072.2 15072.3 15072.4 15072.5 
##      28      54      48      85      46      13      32      29      47 
## 15072.6 
##      64
initial.clusters$n_cells <- table(secretory$initial.cluster)
n_cells_patients <- table(secretory$Patient2)
initial.clusters$n_cells_patients <- n_cells_patients[match(initial.clusters$patient, names(n_cells_patients))]
initial.clusters$weight_cluster <- initial.clusters$n_cells/initial.clusters$n_cells_patients
initial.clusters$paste_weight_id <- paste(initial.clusters$id,"*",initial.clusters$weight_cluster, sep = "")
contrast_all$P1 <- substr(contrast_all$V1, start = 16, stop = 20) # patient 1
contrast_all$P2 <- substr(contrast_all$V2, start = 16, stop = 20) # patient 1
contrast_all$C1 <- NA
contrast_all$C2 <- NA
contrast_all$n_C1 <- NA
contrast_all$n_C2 <- NA
for(i in 1:nrow(contrast_all)) {
    contrast_all$C1[i] <- paste(initial.clusters$paste_weight_id[initial.clusters$patient == contrast_all$P1[i]], collapse = "+")
    contrast_all$C2[i] <- paste(initial.clusters$paste_weight_id[initial.clusters$patient == contrast_all$P2[i]], collapse = "+")
}
head(contrast_all)
##                       V1                     V2    P1    P2
## 1 initial.cluster11543.1 initial.cluster11543.2 11543 11543
## 2 initial.cluster11543.1 initial.cluster11543.3 11543 11543
## 3 initial.cluster11543.1 initial.cluster11543.4 11543 11543
## 4 initial.cluster11543.1 initial.cluster11545.1 11543 11545
## 5 initial.cluster11543.1 initial.cluster11545.2 11543 11545
## 6 initial.cluster11543.1 initial.cluster11545.3 11543 11545
##                                                                                                                                                                      C1
## 1 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 2 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 3 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 4 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 5 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 6 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
##                                                                                                                                                                                                                                                     C2
## 1                                                                                initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 2                                                                                initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 3                                                                                initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 4 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
## 5 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
## 6 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
##   n_C1 n_C2
## 1   NA   NA
## 2   NA   NA
## 3   NA   NA
## 4   NA   NA
## 5   NA   NA
## 6   NA   NA
contrast_matrix <- apply(contrast_all, MARGIN = 1, function(x) return(paste(x[1],"-",x[2],"-(",x[5],")","+(", x[6],")", sep = "")))
cont.matrix <- makeContrasts(contrasts = contrast_matrix,levels=design)
cont.matrix[,5]
## initial.cluster11543.1 initial.cluster11543.2 initial.cluster11543.3 
##             0.90532544            -0.52071006            -0.33136095 
## initial.cluster11543.4 initial.cluster11545.1 initial.cluster11545.2 
##            -0.05325444             0.20199501            -0.78553616 
## initial.cluster11545.3 initial.cluster11545.4 initial.cluster11545.5 
##             0.17705736             0.20448878             0.04488778 
## initial.cluster11545.6 initial.cluster11553.1 initial.cluster11553.2 
##             0.15710723             0.00000000             0.00000000 
## initial.cluster11553.3 initial.cluster11553.4 initial.cluster11553.5 
##             0.00000000             0.00000000             0.00000000 
## initial.cluster11553.6 initial.cluster15066.1 initial.cluster15066.2 
##             0.00000000             0.00000000             0.00000000 
## initial.cluster15066.3 initial.cluster15066.4 initial.cluster15066.5 
##             0.00000000             0.00000000             0.00000000 
## initial.cluster15066.6 initial.cluster15072.1 initial.cluster15072.2 
##             0.00000000             0.00000000             0.00000000 
## initial.cluster15072.3 initial.cluster15072.4 initial.cluster15072.5 
##             0.00000000             0.00000000             0.00000000 
## initial.cluster15072.6 
##             0.00000000
fit2 <- contrasts.fit(fit, cont.matrix) # Compute Contrasts from Linear Model Fit
fit2 <- eBayes(fit2) 

Add weights to DEGs

The DE gene weight is decided by the fold change and the ratio of expression proportion.

## parameter:
## logFC = 0.6
## p-value = 0.05
## weight = abs(logFC)*(expr_ratio1+0.01)/(expr_ratio2+0.01)
## expr_ratio_max > 0.25

n_deg2 <- matrix(0, ncol = nc, nrow = nc)  # number of DE genes
colnames(n_deg2) <- rownames(n_deg2) <- gsub(x = colnames(design)[1:nc], pattern = "initial.cluster",replacement = "")
for(i in 1:nc) {
    for(j in 1:nc) {
        if(i == j) {
            n_deg2[i,j] <- 0
        } else if (j < i) {
            coef_k = (i-1)*(nc-1)+j
        } else if (j > i) {
            coef_k = (i-1)*(nc-1)+j-1
        }
        
        if(i != j) {
            rls <- topTable(fit2, n = Inf, coef = coef_k, sort = "p", lfc = 0.6, p = 0.05 )
            if(nrow(rls) > 1) {
                v_expr <- logcounts(secretory)[rownames(rls), secretory$initial.cluster == rownames(n_deg2)[i]]
                rls$ratio1 <- rowSums(v_expr > 0.5)/ncol(v_expr)
                v_expr <- logcounts(secretory)[rownames(rls), secretory$initial.cluster == colnames(n_deg2)[j]]
                rls$ratio2 <- rowSums(v_expr > 0.5)/ncol(v_expr)
                rls$ratiomax <- rowMaxs(as.matrix(rls[,c("ratio1", "ratio2")]))
                rls$ratiomin <- rowMins(as.matrix(rls[,c("ratio1", "ratio2")]))
                rls <- rls[rls$ratiomax > 0.25, ]
                n_deg2[i,j] <- sum(apply(rls, MARGIN = 1, function(x) return(abs(x[1]) * (x[9]+0.01)/(x[10]+0.01)))) ## 0.01 is used here to exaggerate the differences of on-off genes
            } else if (nrow(rls) == 1) {
                n_deg2[i,j] <- sum(rls$logFC)
            }
            ## This eqaution take fold change and expression ratio into account
            ## Question: should we talk a upper limit to the weight?
        }
    }
}

# saveRDS(n_deg2, "../../scFT-paper_rds/clincluster_secretory_n_deg2_distMat191113.rds")
## pre-run results
n_deg2 <- readRDS("../../scFT-paper_rds/clincluster_secretory_n_deg2_distMat191113.rds")
# any(is.na(n_deg2))
n_deg2[1:5,1:5]
##           11543.1   11543.2   11543.3   11543.4   11545.1
## 11543.1    0.0000 1683.9316  3216.994  1582.100  663.2267
## 11543.2 1683.9316    0.0000  6382.608  1834.765  461.0399
## 11543.3 3216.9943 6382.6082     0.000 11271.148 1909.0925
## 11543.4 1582.0999 1834.7651 11271.148     0.000 3145.7421
## 11545.1  663.2267  461.0399  1909.092  3145.742    0.0000

Final clustering

## 7 clusters
hc <- hclust(as.dist(n_deg2))
hc.cluster <- cutree(hc, k = 10)

colData(secretory)$clincluster.7clusters <- hc.cluster[match(colData(secretory)$initial.cluster, names(hc.cluster))]
# secretory$clincluster.7clusters[secretory$initial.cluster == "11553.2"] <- 8
colData(secretory)$clincluster.7clusters <- as.factor(colData(secretory)$clincluster.7clusters)
table(colData(secretory)$clincluster.7clusters)
## 
##   1   2   3   4   5   6   7   8   9  10 
##  92 228 167  32 312 272 154  40  90  23
## visualisation
hc <- hclust(as.dist(n_deg2))
plot(hc);rect.hclust(hc, k = 10, border = "red")

plotTSNE(secretory, colour_by = "clincluster.7clusters")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

# tiff("../manuscript_plots/FigureS6A_Clustering_FTESCs_tSNE.tiff", res = 300, height = 10, width = 14, units = "cm")
# plotTSNE(secretory, colour_by = "clincluster")
# dev.off()
# 
# tiff("../manuscript_plots/FigureS6B_Clustering_FTESCs_tSNE_patients.tiff", res = 300, height = 10, width = 14, units = "cm")
# plotTSNE(secretory, colour_by = "Patient2")
# dev.off()
secretory$clincluster_final <- secretory$clincluster.7clusters
secretory$clincluster_final[secretory$clincluster.7clusters == 7] <- 4
secretory$clincluster_final <- paste("C",secretory$clincluster_final,sep = "")

Identifying marker genes

markers2 <- c()
logcounts <- logcounts(secretory)
for(i in 1:length(unique(secretory$clincluster_final))){
    info <- rep("control", ncol(secretory))
    info[secretory$clincluster_final == unique(secretory$clincluster_final)[i]] <- "group"
    design <- model.matrix(~ 0 + info)
    v <- voom(dge, design, plot = F)
    fit <- lmFit(v, design) # Linear Model for Series of Arrays
    cont.matrix <- makeContrasts(contrasts = "infogroup-infocontrol",levels=design)
    fit <- contrasts.fit(fit, cont.matrix ) # Linear Model for Series of Arrays
    fit <- eBayes(fit)
    
    marker <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
    marker <- marker[marker$logFC > 0.6,]
     
    v_expr <- logcounts[match(rownames(marker), rownames(logcounts)), info == "group"]
    marker$ratio1 <- rowSums(v_expr > 0.5)/ncol(v_expr)
    v_expr <- logcounts[match(rownames(marker), rownames(logcounts)), info != "group"]
    marker$ratio2 <- rowSums(v_expr > 0.5)/ncol(v_expr)
    marker$gene <- rownames(marker) 
    marker$cluster <- unique(secretory$clincluster_final)[i]
    markers2  <- rbind(markers2, marker)
}
markers2$cluster <- factor(markers2$cluster)
# write.csv(markers2, "../tables/20190120Clincluster_fresh_secretory_9clusters_markers.csv")
markers2 <- read.csv("../../scFT-paper_rds/20190120Clincluster_fresh_secretory_9clusters_markers.csv", as.is = T)
top10 <- markers2 %>% group_by(cluster) %>% top_n(3, logFC)
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
knitr::kable(top10)
X logFC AveExpr t P.Value adj.P.Val B ratio1 ratio2 gene cluster
SERINC5 1.956063 7.107523 10.547525 0.0e+00 0.0000000 46.109691 0.9166667 0.7563452 SERINC5 C2
SET 1.501504 8.682341 14.337685 0.0e+00 0.0000000 88.053797 1.0000000 0.9822335 SET C2
CDV3 1.426887 5.157615 7.548573 0.0e+00 0.0000000 20.828222 0.6710526 0.5490694 CDV3 C2
OVGP1 1.415347 5.962832 5.222745 2.0e-07 0.0000462 6.760365 0.5608974 0.4262295 OVGP1 C5
TRIB1 1.125414 6.411907 6.188506 0.0e+00 0.0000004 11.907277 0.8108974 0.6621129 TRIB1 C5
FHL2 1.093554 5.512801 5.650759 0.0e+00 0.0000061 8.934340 0.6698718 0.5018215 FHL2 C5
DPP4 2.243219 4.054546 9.317093 0.0e+00 0.0000000 34.637577 0.6766467 0.2622687 DPP4 C3
LTBP4 2.102409 5.130780 8.724552 0.0e+00 0.0000000 29.709480 0.7964072 0.4489139 LTBP4 C3
SLC25A25 2.074663 5.932589 9.006236 0.0e+00 0.0000000 32.029021 0.9161677 0.5929204 SLC25A25 C3
JUN 2.724953 6.173253 14.027896 0.0e+00 0.0000000 84.025000 0.8566176 0.5755712 JUN C6
FOS 2.510552 11.718297 16.909273 0.0e+00 0.0000000 122.037795 0.9963235 0.9630931 FOS C6
TXNIP 2.184585 3.680430 10.965286 0.0e+00 0.0000000 50.154916 0.5000000 0.1652021 TXNIP C6
ACTA2 6.186113 3.808314 15.277895 0.0e+00 0.0000000 95.334245 0.8750000 0.2547445 ACTA2 C8
TIMP3 5.489491 2.803985 12.653211 0.0e+00 0.0000000 64.819243 0.9000000 0.0802920 TIMP3 C8
A2M 4.561939 2.722170 9.437405 0.0e+00 0.0000000 34.280756 0.6750000 0.0525547 A2M C8
KRT17 3.631127 5.782838 12.343843 0.0e+00 0.0000000 64.801783 0.7580645 0.3839869 KRT17 C4
SNCG 3.486426 6.076391 16.095046 0.0e+00 0.0000000 111.195687 0.9193548 0.5653595 SNCG C4
PIGR 3.355805 3.148991 15.322608 0.0e+00 0.0000000 100.904963 0.6774194 0.0808824 PIGR C4
HLA-DRB51 1.901589 8.147843 7.993135 0.0e+00 0.0000000 23.753971 0.9782609 0.9044006 HLA-DRB5 C1
ZC3H12A2 1.757681 8.082003 6.175338 0.0e+00 0.0000001 11.846008 0.9565217 0.8239757 ZC3H12A C1
CTGF1 1.751632 7.709149 4.707172 2.8e-06 0.0000512 4.356458 0.8695652 0.7116844 CTGF C1
STMN1 4.492856 4.766765 8.135457 0.0e+00 0.0000000 24.553233 0.9565217 0.4419611 STMN1 C10
MCM71 4.318530 4.028890 7.192254 0.0e+00 0.0000000 17.940752 0.9130435 0.2941601 MCM7 C10
OVGP11 4.038045 5.962832 4.990426 7.0e-07 0.0002998 5.636433 0.8260870 0.4498919 OVGP1 C10
CXCL1 5.657529 5.751003 16.819535 0.0e+00 0.0000000 120.111966 0.9444444 0.4500000 CXCL1 C9
SERPINA31 5.175707 4.339827 15.246068 0.0e+00 0.0000000 98.969586 0.8888889 0.2530303 SERPINA3 C9
CCL22 5.119400 6.475462 15.195825 0.0e+00 0.0000000 98.531420 0.9666667 0.5734848 CCL2 C9
my_col <- RColorBrewer::brewer.pal(12, "Paired")[c(2,8,4,6,10,12,11,9,7,5,3,1)]
# secretory$clincluster_final <- as.factor(secretory$clincluster_final)
p1 <- plotTSNE(secretory[,secretory$Patient2 == 11543], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(1,2,3,4)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p2 <- plotTSNE(secretory[,secretory$Patient2 == 11545], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(2,3,4,5,6,8)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p3 <- plotTSNE(secretory[,secretory$Patient2 == 11553], colour_by = "clincluster_final") + scale_fill_manual(values = my_col[c(10,4,5,6,8,9)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p4 <- plotTSNE(secretory[,secretory$Patient2 == 15066], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(10,2,3,4,5,6)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p5 <- plotTSNE(secretory[,secretory$Patient2 == 15072], colour_by = "clincluster_final") + scale_fill_manual(values = my_col[c(1,4,5,6,8)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
cowplot::plot_grid(p1,p2,p3,p4,p5,nrow = 3)

# ggsave("exprloratory_plots/20180121tSNE_Patients_9clusters.pdf", height = 8, width = 7, units = "in", dpi = 150)
table(secretory$clincluster_final, secretory$Patient2)
##      
##       11543 11545 11553 15066 15072
##   C1     16     0     0     0    76
##   C10     0     0    14     9     0
##   C2     88    86     0    54     0
##   C3     56    63     0    48     0
##   C4      9    82    23    26    46
##   C5      0    81   114    85    32
##   C6      0    71   109    28    64
##   C8      0    18     9     0    13
##   C9      0     0    90     0     0

Save data

# saveRDS(secretory, "../rds/20190120Fresh_secretory_9clusters_clincluster.rds", compress = T)
# secretory <- readRDS("../rds/20190120Fresh_secretory_9clusters_clincluster.rds")
# markers2 <- read.csv("../tables/20190120Clincluster_fresh_secretory_9clusters_markers.csv", as.is = T)
set.seed(1234)

Compare to stromal cells (sc16)

dim(stroma)
## [1] 23710    91
matrix <- counts(secretory)[,secretory$clincluster_final == "C8"]
matrix <- cbind(matrix,counts(stroma)[match(rownames(matrix), rownames(stroma)),] )
matrix <- cpm(matrix)
dge <- edgeR::DGEList(counts = matrix[rowSums(matrix) > 5, ])
info <- c(rep("C8", 40), rep("str",91))
design <- model.matrix(~ 0 + info)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
cont.matrix <- makeContrasts(contrasts = "infoC8-infostr",levels=design)
fit <- contrasts.fit(fit, cont.matrix ) # Linear Model for Series of Arrays
fit <- eBayes(fit)

C8.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
C8.m$gene <- rownames(C8.m)
which(markers2$gene[markers2$cluster == "C8"] %in% C8.m$gene[C8.m$logFC > 0])
##  [1]   3   4  20  25  42  63  68  74  76  81  82  85  87  96 108 121 124
## [18] 133 157
matrix <- counts(secretory)[,secretory$clincluster_final == "C8"]
matrix <- cbind(matrix, counts(secretory)[,secretory$clincluster_final %in% c("C3","C4","C10")])
matrix <- cbind(matrix,counts(stroma)[match(rownames(matrix), rownames(stroma)),] )
info <- c(rep("C8",sum(secretory$clincluster_final == "C8")),
           rep("secretory", sum(secretory$clincluster_final %in% c("C3","C4","C10"))),
          rep("str",91))
matrix <- cpm(matrix)
dge <- edgeR::DGEList(counts = matrix[rowSums(matrix) > 5, ])

design <- model.matrix(~ 0 + info)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
cont.matrix <- makeContrasts(contrasts = c("infoC8-infosecretory","infostr-infosecretory","infostr-infoC8"),levels=design)
fit <- contrasts.fit(fit, cont.matrix) # Linear Model for Series of Arrays
fit <- eBayes(fit)

StrvsSec.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 2, lfc = 0.6, sort.by = "logFC")
StrvsSec.m$gene <- rownames(StrvsSec.m)

C8vsSec.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
C8vsSec.m$gene <- rownames(C8vsSec.m)

StrvsC8.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
StrvsC8.m$gene <- rownames(StrvsC8.m)
stroma_control <- SingleCellExperiment(assays = list(counts = matrix ), colData = data.frame(Type = info))
logcounts(stroma_control) <- log1p(calculateCPM(stroma_control))

plotExpression(stroma_control, features = c(markers2$gene[markers2$cluster == "C8"][which(markers2$gene[markers2$cluster == "C8"] %in% C8.m$gene[C8.m$logFC > 0])]), x = "Type", ncol = 4, scales = "free")

plotExpression(stroma_control, features = c("COL1A2","COL3A1"), x = "Type", ncol = 2, scales = "free")  + 
    scale_x_discrete(labels = c("EMT","Other FTESCs","Stroma"),breaks = c("C8","secretory","str")) + 
    theme(strip.background = element_rect(fill = "white"),strip.text.x = element_text(face = "italic", size = 12))

# ggsave("plots/SuppFig3_EMT_stroma_col1a.png", height = 3, width = 5)
plotExpression(stroma_control, features = c("SPARC","RGS16","COL1A2","COL3A1","EPCAM","KRT7"), 
               x = "Type", ncol = 2, scales = "free") + 
    scale_x_discrete(labels = c("EMT","Other FTESCs","Stroma"),breaks = c("C8","secretory","str")) + 
    theme(strip.background = element_rect(fill = "white"),strip.text.x = element_text(face = "italic", size = 12))

# ggsave("plots/SuppFig3_EMT_markers_with_stroma_control.png", width = 6, height = 8)

Using DoubletFinder

fresh <- sceset[,sceset$source == "Fresh"]

fresh$cell_type <- NA
fresh$cell_type[colnames(fresh) %in% colnames(sceset)[sceset$final.clusters == 11]] <- "Ciliated"
fresh$cell_type[colnames(fresh) %in% colnames(secretory)] <- "Secretory"

table(fresh$cell_type)
## 
##  Ciliated Secretory 
##       146      1410
library(DoubletFinder) # 2.0.1
# devtools::install_version(package = 'Seurat', version = package_version('2.3.4'))
library(Seurat) # 2.3.4
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
fresh_seu <- Seurat::as.seurat(fresh)
sc16_seu <- Seurat::as.seurat(sc16)
sc16_seu@meta.data$Source <- "Fresh"
fresh_seu@meta.data <- fresh_seu@meta.data[,colnames(fresh_seu@meta.data) %in% colnames(sc16_seu@meta.data)]
sc16_seu@meta.data <- sc16_seu@meta.data[,match(colnames(fresh_seu@meta.data), colnames(sc16_seu@meta.data))]

fresh_seu@meta.data <- fresh_seu@meta.data [,c(1,2,3,4,5,21)]
sc16_seu@meta.data <- sc16_seu@meta.data [,c(1,2,3,4,5,21)]

seurat <- Seurat::MergeSeurat(object1 = fresh_seu, object2 =  sc16_seu)
seurat <- FindVariableGenes(seurat)
seurat <- ScaleData(seurat)
seurat <- RunPCA(seurat, pc.genes =  seurat@var.genes)

fresh2 <- as.SingleCellExperiment(seurat)
set.seed(12334);fresh2 <- runTSNE(fresh2)
plotTSNE(fresh2, colour_by  = "PTPRC")
seurat <- NormalizeData(seurat)
seurat <- FindVariableGenes(seurat, x.low.cutoff = 0.0125, y.cutoff = 0.25, do.plot=FALSE)
seurat <- ScaleData(object = seurat, genes.use = seurat@var.genes)
seurat <- RunPCA(seurat, pc.genes = seurat@var.genes, pcs.print = 0)
seurat <- RunTSNE(seurat, dims.use = 1:10, verbose=TRUE)
DimElbowPlot(seurat)
seurat <- FindClusters(object = seurat, reduction.type = "pca", dims.use = 1:10, 
    resolution = 0.3, print.output = 0, save.SNN = TRUE, force.recalc = T)
TSNEPlot(object = seurat)
## pK Identification
sweep.res.list <- paramSweep(seurat, PCs = 1:10)
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats) #0.07

## Homotypic Doublet Proportion Estimate
annotations <- seurat@ident
homotypic.prop <- modelHomotypic(annotations)           ## ex: annotations <- seu_kidney@meta.data$ClusteringResults
nExp_poi <- round(0.01*length(seurat@cell.names))  ## 1% based on 2000 cells 
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

## Run DoubletFinder with varying classification stringencies
seurat <- doubletFinder(seurat, PCs = 1:10, pN = 0.25, pK = 0.07, nExp = nExp_poi, reuse.pANN = FALSE)
seurat <- doubletFinder(seurat, PCs = 1:10, pN = 0.25, pK = 0.07, nExp = nExp_poi.adj, reuse.pANN = "pANN_0.25_0.07_25")
# saveRDS(seurat,"rds/20190508Seurat_all_fresh_doublefinder.rds", compress = T)
seurat <- readRDS("../../scFT-paper_rds/20190508Seurat_all_fresh_doublefinder.rds")
## Plot results
seurat@meta.data$DF_hi.lo <- seurat@meta.data$DF.classifications_0.25_0.07_25
seurat@meta.data$DF_hi.lo[which(seurat@meta.data$DF_hi.lo == "Doublet" & seurat@meta.data$DF.classifications_0.25_0.07_21 == "Singlet")] <- "Doublet_lo"
seurat@meta.data$DF_hi.lo[which(seurat@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi"
TSNEPlot(seurat, group.by="DF_hi.lo", plot.order=c("Doublet_hi","Doublet_lo","Singlet"), colors.use=c("black","gold","red"))

seurat@meta.data$cell_type <- NA
seurat@meta.data$cell_type <- fresh$cell_type[match(rownames(seurat@meta.data), colnames(fresh))]
seurat@meta.data$cell_subtype <- secretory$clincluster_final[match(rownames(seurat@meta.data), colnames(secretory))]

# table(seurat@meta.data$cell_subtype, seurat@meta.data$DF_hi.lo)
secretory$DF_hi.lo <- seurat@meta.data$DF_hi.lo[match(colnames(secretory), rownames(seurat@meta.data))]
knitr::kable(table(secretory$DF_hi.lo[secretory$clincluster_final == "C8"]), caption = "The 40 cells in the EMT cluster are all singlet.")
The 40 cells in the EMT cluster are all singlet.
Var1 Freq
Singlet 40

Technical

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Seurat_2.3.4                Matrix_1.2-15              
##  [3] cowplot_0.9.4               DoubletFinder_2.0.1        
##  [5] bindrcpp_0.2.2              reshape2_1.4.3             
##  [7] scales_1.0.0                dplyr_0.7.8                
##  [9] edgeR_3.24.3                limma_3.38.3               
## [11] scater_1.10.1               ggplot2_3.2.0.9000         
## [13] SingleCellExperiment_1.4.1  SummarizedExperiment_1.12.0
## [15] DelayedArray_0.8.0          BiocParallel_1.16.5        
## [17] matrixStats_0.54.0          Biobase_2.42.0             
## [19] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
## [21] IRanges_2.16.0              S4Vectors_0.20.1           
## [23] BiocGenerics_0.28.0        
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3               backports_1.1.3         
##   [3] Hmisc_4.2-0              plyr_1.8.4              
##   [5] igraph_1.2.3             lazyeval_0.2.1          
##   [7] splines_3.5.2            digest_0.6.18           
##   [9] foreach_1.4.4            htmltools_0.3.6         
##  [11] viridis_0.5.1            lars_1.2                
##  [13] gdata_2.18.0             magrittr_1.5            
##  [15] checkmate_1.9.1          cluster_2.0.7-1         
##  [17] mixtools_1.1.0           ROCR_1.0-7              
##  [19] R.utils_2.7.0            colorspace_1.4-0        
##  [21] xfun_0.4                 jsonlite_1.6            
##  [23] crayon_1.3.4             RCurl_1.95-4.11         
##  [25] bindr_0.1.1              zoo_1.8-4               
##  [27] survival_2.43-3          iterators_1.0.10        
##  [29] ape_5.2                  glue_1.3.0              
##  [31] gtable_0.2.0             zlibbioc_1.28.0         
##  [33] XVector_0.22.0           kernlab_0.9-27          
##  [35] Rhdf5lib_1.4.2           prabclus_2.2-7          
##  [37] DEoptimR_1.0-8           HDF5Array_1.10.1        
##  [39] mvtnorm_1.0-8            bibtex_0.4.2            
##  [41] Rcpp_1.0.0               metap_1.1               
##  [43] dtw_1.20-1               viridisLite_0.3.0       
##  [45] htmlTable_1.13.1         reticulate_1.10         
##  [47] foreign_0.8-71           bit_1.1-14              
##  [49] proxy_0.4-22             mclust_5.4.2            
##  [51] SDMTools_1.1-221         Formula_1.2-3           
##  [53] tsne_0.1-3               htmlwidgets_1.3         
##  [55] httr_1.4.0               gplots_3.0.1.1          
##  [57] RColorBrewer_1.1-2       fpc_2.1-11.1            
##  [59] acepack_1.4.1            modeltools_0.2-22       
##  [61] ica_1.0-2                pkgconfig_2.0.2         
##  [63] R.methodsS3_1.7.1        flexmix_2.3-14          
##  [65] nnet_7.3-12              locfit_1.5-9.1          
##  [67] tidyselect_0.2.5         labeling_0.3            
##  [69] rlang_0.4.0              munsell_0.5.0           
##  [71] tools_3.5.2              ggridges_0.5.1          
##  [73] evaluate_0.13            stringr_1.4.0           
##  [75] yaml_2.2.0               kknn_1.3.1              
##  [77] npsurv_0.4-0             knitr_1.21              
##  [79] bit64_0.9-7              fitdistrplus_1.0-14     
##  [81] robustbase_0.93-3        caTools_1.17.1.1        
##  [83] purrr_0.3.0              RANN_2.6.1              
##  [85] pbapply_1.4-0            nlme_3.1-137            
##  [87] R.oo_1.22.0              hdf5r_1.0.1             
##  [89] compiler_3.5.2           rstudioapi_0.9.0        
##  [91] png_0.1-7                beeswarm_0.2.3          
##  [93] lsei_1.2-0               tibble_2.0.1            
##  [95] stringi_1.2.4            highr_0.7               
##  [97] lattice_0.20-38          trimcluster_0.1-2.1     
##  [99] pillar_1.3.1             lmtest_0.9-36           
## [101] Rdpack_0.10-1            irlba_2.3.3             
## [103] data.table_1.12.0        bitops_1.0-6            
## [105] gbRd_0.4-11              R6_2.3.0                
## [107] latticeExtra_0.6-28      KernSmooth_2.23-15      
## [109] gridExtra_2.3            vipor_0.4.5             
## [111] codetools_0.2-16         MASS_7.3-51.1           
## [113] gtools_3.8.1             assertthat_0.2.0        
## [115] rhdf5_2.26.2             withr_2.1.2             
## [117] GenomeInfoDbData_1.2.0   diptest_0.75-7          
## [119] doSNOW_1.0.16            grid_3.5.2              
## [121] rpart_4.1-13             tidyr_0.8.2             
## [123] class_7.3-15             rmarkdown_1.11          
## [125] DelayedMatrixStats_1.4.0 segmented_0.5-3.0       
## [127] Rtsne_0.15               base64enc_0.1-3         
## [129] ggbeeswarm_0.6.0